Scikit-Learn is simple

Classification


In [1]:
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split


iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [2]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

Transformations


In [3]:
from sklearn.decomposition import PCA

In [4]:
pca = PCA(n_components=2)
pca.fit(X)
X_pca = pca.transform(X)

Tools

Cross-validation scoring


In [5]:
import numpy as np
np.set_printoptions(precision=2)

In [6]:
from sklearn.cross_validation import cross_val_score, StratifiedKFold
scores = cross_val_score(SVC(), X_train, y_train, cv=5)
print(scores)


[ 0.92  1.    0.96  1.    1.  ]

In [7]:
from sklearn.cross_validation import ShuffleSplit
cv_ss = ShuffleSplit(len(X_train))
scores_shuffle_split = cross_val_score(SVC(), X_train, y_train, cv=cv_ss)
print(scores_shuffle_split)


[ 1.    1.    1.    1.    1.    0.83  1.    0.92  1.    0.92]

In [8]:
from sklearn.cross_validation import LeaveOneLabelOut
labels = np.arange(len(X_train)) % 3
cv_label = LeaveOneLabelOut(labels)
scores_pout = cross_val_score(SVC(), X_train, y_train, cv=cv_label)

Cross-validated grid-searches


In [9]:
import numpy as np
from sklearn.grid_search import GridSearchCV
param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma': 10. ** np.arange(-3, 3)}
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.score(X_test, y_test))


{'C': 100.0, 'gamma': 0.01}
1.0

Pipelining


In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), SVC())
pipe.fit(X_train, y_train)
pipe.predict(X_test)


Out[10]:
array([0, 0, 1, 2, 0, 2, 0, 1, 0, 2, 2, 1, 2, 2, 0, 2, 1, 2, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1])